'''
FM(因子分解机)模型算法：稀疏数据下的特征二阶组合问题（个性化特征）
1、应用矩阵分解思想，引入隐向量构造FM模型方程
2、目标函数（损失函数复合FM模型方程）的最优问题：链式求偏导
3、SGD优化目标函数
'''
import pandas as pd
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from _datetime import datetime
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import str


# Matplotlib global config: use the SimHei font so Chinese axis labels and
# titles render, and keep the minus sign displayable under a non-ASCII font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

'''二分类输出非线性映射'''
def sigmoid(x):
    """Logistic function: squash a raw model score into the open interval (0, 1)."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

'''Logit loss for labels y in {-1, +1}: L = log(1 + e**(-y * y_hat))'''
def logit(y, y_hat):
    # np.logaddexp(0, z) computes log(1 + e**z) in a numerically stable way;
    # the naive np.log(1 + np.exp(z)) overflows to inf once z exceeds ~709.
    # (The old header comment also dropped the minus sign in the formula.)
    return np.logaddexp(0, -y * y_hat)
def error(a, b):
    """Return 1 if the two labels disagree, otherwise 0."""
    return 1 if a != b else 0

'''计算logit损失函数的外层偏导(不含y_hat的一阶偏导)'''
def df_logit(y, y_hat):
    """Outer derivative of the logit loss w.r.t. y_hat (chain-rule outer factor).

    d/d(y_hat) log(1 + e**(-y * y_hat)) = -y / (1 + e**(y * y_hat))
    """
    denom = 1 + np.exp(y * y_hat)
    return -y / denom

'''FM的模型方程：LR线性组合 + 交叉项组合 = 1阶特征组合 + 2阶特征组合'''
def FM(Xi, w0, W, V):
    """FM score for one sample Xi: bias + linear term + 2nd-order interactions.

    The pairwise weight w_ij is the inner product <V_i, V_j> of the latent
    vectors for features i and j.  The identity
        sum_{i<j} <V_i, V_j> x_i x_j
            = 1/2 * sum_f [ (sum_i v_if x_i)**2 - sum_i v_if**2 x_i**2 ]
    evaluates the interaction term in O(k*n) instead of O(k*n**2).
    """
    square_of_sum = (Xi.dot(V)) ** 2          # (sum_i v_if x_i)**2, per factor f
    sum_of_square = (Xi ** 2).dot(V ** 2)     # sum_i v_if**2 x_i**2, per factor f
    pairwise = np.sum(square_of_sum - sum_of_square)
    score = w0 + Xi.dot(W) + pairwise / 2
    return score[0]

'''SGD更新FM模型的参数列表：[w0, W, V]'''
def FM_SGD(X, y, k=2, alpha=0.01, iter=50):
    """Train the FM parameters [w0, W, V] by stochastic gradient descent.

    Parameters
    ----------
    X : ndarray, shape (m, n) -- training features
    y : ndarray, shape (m,)   -- labels in {-1, +1}
    k : int                    -- latent-vector dimension
    alpha : float              -- learning rate
    iter : int                 -- number of full passes over the training set

    Returns
    -------
    list of [w0, W, V] parameter snapshots, one per pass.
    """
    m, n = np.shape(X)
    w0, W = 0, np.zeros((n, 1))  # bias w0 and linear weights W (n, 1)
    # Latent matrix V (n, k) ~ N(0, 1); row j is the latent vector of feature j.
    V = np.random.normal(loc=0, scale=1, size=(n, k))
    all_FM_params = []

    for _ in range(iter):
        for i in range(m):  # one SGD step per training sample
            y_hat = FM(Xi=X[i], w0=w0, W=W, V=V)
            dloss = df_logit(y=y[i], y_hat=y_hat)  # outer derivative of the loss
            w0 = w0 - alpha * dloss                # d(y_hat)/d(w0) == 1
            for j in range(n):
                if X[i, j] != 0:  # zero-valued features contribute no gradient
                    W[j] = W[j] - alpha * dloss * X[i, j]
                    for f in range(k):
                        # d(y_hat)/d(V[j, f]) for the O(k*n) interaction form
                        dloss_Vjf = dloss * (X[i, j] * (X[i].dot(V[:, f])) - V[j, f] * X[i, j] ** 2)
                        V[j, f] = V[j, f] - alpha * dloss_Vjf
        # BUG FIX: snapshot *copies* of W and V.  The original appended live
        # references, so every saved snapshot aliased the final parameters and
        # per-iteration curves (loss/accuracy vs iteration) were meaningless.
        all_FM_params.append([w0, W.copy(), V.copy()])
    return all_FM_params

def SS(X_test, y_test, y_hat, iter=50, alpha=''):
    """Append the mean test-set logit loss of fixed predictions y_hat to a CSV log.

    NOTE(review): y_hat does not change inside the loop, so every one of the
    `iter` logged lines reports the same loss value -- presumably the lines
    are iteration-indexed for alignment with the training log; confirm with
    the caller.  `alpha` (new, defaults to '') is interpolated into the file
    name for per-learning-rate logs.
    """
    X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_hat = np.array(y_hat)
    m_test = np.shape(X_test)[0]
    for it in range(iter):
        total_loss_test = 0  # summed logit loss over the test set
        for i in range(m_test):
            total_loss_test += logit(y=y_test[i], y_hat=y_hat[i])
        # BUG FIX: the original opened a file literally named
        # '学习率为{}test mes.csv' -- the {} placeholder was never formatted.
        with open('学习率为{}test mes.csv'.format(alpha), 'a', encoding='gbk') as f:
            f.write('FM第{}次迭代，当前测试集损失值为：{:.4f}'.format(it + 1, total_loss_test / m_test) + '\n')

'''FM模型预测测试集分类结果'''
def FM_predict(X, w0, W, V):
    """Predict a class label in {-1, +1} for every row of X.

    Each row's FM score is mapped through sigmoid and thresholded at 0.5
    (equivalently: sign of the raw score).  Returns an ndarray of labels.
    """
    threshold = 0.5  # sigmoid decision threshold
    predicts = [
        -1 if sigmoid(FM(Xi=X[i], w0=w0, W=W, V=V)) < threshold else 1
        for i in range(X.shape[0])
    ]
    return np.array(predicts)

'''FM在不同迭代次数下的参数列表中，训练集的损失值和测试集的准确率变化'''
def draw_research(all_FM_params, X_train, y_train, X_test, y_test):
    """Plot training-set mean loss and test-set accuracy per saved FM snapshot."""
    all_total_loss, all_total_accuracy = [], []
    for w0, W, V in all_FM_params:
        total_loss = 0
        for i in range(X_train.shape[0]):
            total_loss += logit(y=y_train[i], y_hat=FM(Xi=X_train[i], w0=w0, W=W, V=V))
        all_total_loss.append(total_loss / X_train.shape[0])
        all_total_accuracy.append(accuracy_score(y_test, FM_predict(X=X_test, w0=w0, W=W, V=V)))
    iterations = np.arange(len(all_FM_params))
    plt.plot(iterations, all_total_loss, color='#FF4040', label='训练集的损失值')
    # BUG FIX: the original plotted the accuracy curve twice (blue and green)
    # under the same legend label; one curve is enough.
    plt.plot(iterations, all_total_accuracy, color='#4876FF', label='测试集的准确率')
    plt.xlabel('迭代次数')
    plt.title('FM模型:二阶互异特征组合')
    plt.legend()
    plt.show()


# Plot the ROC curve and print the AUC.
def ROC(Y_val, y_pred):
    from sklearn.metrics import roc_curve, auc  # ROC points and area under curve
    import matplotlib.pyplot as plt
    fpr, tpr, threshold = roc_curve(Y_val, y_pred)  # false/true positive rates
    roc_auc = auc(fpr, tpr)  # area under the ROC curve
    print("AUC:", roc_auc)
    lw = 2  # line width
    # BUG FIX: the original called plt.figure() and then plt.figure(figsize=...),
    # leaving a stray empty figure; create the sized figure exactly once.
    plt.figure(figsize=(10, 10))
    # FPR on the x axis, TPR on the y axis.
    plt.plot(fpr, tpr, color='green',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    # Chance diagonal y = x for reference.
    plt.plot([0, 1], [0, 1], color='yellow', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc="lower right")
    plt.show()


# Plot the precision-recall curve with a shaded area underneath.
def P_R(Y_val, y_pred):
    from sklearn.metrics import precision_recall_curve
    import matplotlib.pyplot as plt

    precision, recall, _ = precision_recall_curve(Y_val, y_pred)

    plt.step(recall, precision, where='post', color='b', alpha=0.2)
    plt.fill_between(recall, precision, step='post', color='b', alpha=0.2)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('P_R')
    plt.show()


# Cohen's kappa agreement statistic between true and predicted labels.
def Kappa(Y_val, y_pred):
    from sklearn.metrics import cohen_kappa_score
    score = cohen_kappa_score(Y_val, y_pred)
    print("Kappa系数:", score)

# Hamming loss (fraction of mismatched labels) between truth and prediction.
def ham_distance(Y_val, y_pred):
    from sklearn.metrics import hamming_loss
    loss = hamming_loss(Y_val, y_pred)
    print("海明距离：", loss)

def f1_score(precision, recall):
    """Harmonic mean of precision and recall; 0.0 when both are zero."""
    try:
        return 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        # precision + recall == 0 -> F1 is defined as 0 here.
        # BUG FIX: the original bare `except:` also swallowed TypeError,
        # KeyboardInterrupt, etc., hiding real bugs.
        return 0.0


# DCG@k: discounted cumulative gain of the top-k items ranked by y_pred.
def get_dcg(y_pred, y_true, k):
    # y_pred and y_true are aligned element-wise; a larger y_pred value means
    # the item is ranked closer to the top (treated as more relevant).
    frame = pd.DataFrame({"y_pred": y_pred, "y_true": y_true})
    top_k = frame.sort_values(by="y_pred", ascending=False).head(k)
    positions = np.arange(1, top_k["y_true"].count() + 1)  # ranks start at 1
    gains = (2 ** top_k["y_true"] - 1) / np.log2(positions + 1)
    return np.sum(gains)


def get_ndcg(y_pred, y_true, k):
    """NDCG@k: DCG of the predicted ranking normalised by the ideal DCG.

    The ideal DCG is obtained by ranking items by y_true itself.
    NOTE(review): if the top-k ideal gain is 0 the division yields nan
    (silenced by the np.seterr call at module top) -- confirm callers
    tolerate that.
    """
    dcg = get_dcg(y_pred, y_true, k)
    idcg = get_dcg(y_true, y_true, k)  # best achievable DCG for these labels
    return dcg / idcg

# Per-iteration test-set metrics (precision / recall / NDCG / mean logit loss)
# for every saved FM parameter snapshot; each metric is appended to a CSV log.
def ALL_test(all_FM_params, y_test, X_test, alpha):
    m = np.shape(X_test)[0]
    p_test, r_test, ndcg_test, mes_test = [], [], [], []
    for t, (w0, W, V) in enumerate(all_FM_params, start=1):
        predicts_test = FM_predict(X=X_test, w0=w0, W=W, V=V)
        ndcg = get_ndcg(predicts_test, y_test, 1000)
        precision_score_test = precision_score(y_test, predicts_test)
        recall_score_test = recall_score(y_test, predicts_test)
        with open('学习率为{}test P.csv'.format(alpha), 'a', encoding='gbk') as f:
            f.write('FM第{}次迭代，当前测试集查准率为：{:.4f}'.format(t, precision_score_test) + '\n')
        with open('学习率为{}test R.csv'.format(alpha), 'a', encoding='gbk') as f:
            f.write('FM第{}次迭代，当前测试集查全率为：{:.4f}'.format(t, recall_score_test) + '\n')
        with open('学习率为{}test ndcg.csv'.format(alpha), 'a', encoding='gbk') as f:
            f.write('FM第{}次迭代，当前测试集ndcg为：{:.4f}'.format(t, ndcg) + '\n')
        total_loss_test = 0  # summed logit loss of this snapshot on the test set
        for i in range(m):
            total_loss_test += logit(y=y_test[i], y_hat=FM(Xi=X_test[i], w0=w0, W=W, V=V))
        with open('学习率为{}test mes.csv'.format(alpha), 'a', encoding='gbk') as f:
            f.write('FM第{}次迭代，当前测试集损失值为：{:.4f}'.format(t, total_loss_test / m) + '\n')
        if t == 20:  # single console progress report at the 20th snapshot
            print('学习率为{}的FM第{}次迭代，当前测试集查准率为：{:.2%}'.format(alpha, t, precision_score_test))
            print('学习率为{}的FM第{}次迭代，当前测试集查全率为：{:.2%}'.format(alpha, t, recall_score_test))
            print('学习率为{}的FM第{}次迭代，当前测试集ndcg为：{:.4f}'.format(alpha, t, ndcg))
            print('学习率为{}的FM第{}次迭代，当前测试集损失值为：{:.4f}'.format(alpha, t, total_loss_test / m))
        # BUG FIX: the original guarded these appends with `if t != 0`, which
        # was always true because t starts at 1 -- the dead guard is removed.
        p_test.append(precision_score_test)
        r_test.append(recall_score_test)
        ndcg_test.append(ndcg)
        mes_test.append(total_loss_test / m)
    return p_test, r_test, ndcg_test, mes_test


# Per-iteration training-set metrics (precision / recall / NDCG / mean logit
# loss) for every saved FM parameter snapshot; each metric is logged to CSV.
def ALL_train(all_FM_params, y_train, X_train, alpha):
    m = np.shape(X_train)[0]
    p_train, r_train, ndcg_train, mes_train = [], [], [], []
    for t, (w0, W, V) in enumerate(all_FM_params, start=1):
        predicts_train = FM_predict(X=X_train, w0=w0, W=W, V=V)
        ndcg = get_ndcg(predicts_train, y_train, 1000)
        precision_score_train = precision_score(y_train, predicts_train)
        recall_score_train = recall_score(y_train, predicts_train)
        with open('学习率为{}train P.csv'.format(alpha), 'a', encoding='gbk') as f:
            f.write('FM第{}次迭代，当前训练集查准率为：{:.4f}'.format(t, precision_score_train) + '\n')
        with open('学习率为{}train R.csv'.format(alpha), 'a', encoding='gbk') as f:
            f.write('FM第{}次迭代，当前训练集查全率为：{:.4f}'.format(t, recall_score_train) + '\n')
        with open('学习率为{}train ndcg.csv'.format(alpha), 'a', encoding='gbk') as f:
            f.write('FM第{}次迭代，当前训练集ndcg为：{:.4f}'.format(t, ndcg) + '\n')
        total_loss_train = 0  # summed logit loss of this snapshot on the training set
        for i in range(m):
            total_loss_train += logit(y=y_train[i], y_hat=FM(Xi=X_train[i], w0=w0, W=W, V=V))
        with open('学习率为{}train mes.csv'.format(alpha), 'a', encoding='gbk') as f:
            f.write('FM第{}次迭代，当前训练集损失值为：{:.4f}'.format(t, total_loss_train / m) + '\n')
        if t == 20:  # single console progress report at the 20th snapshot
            print('学习率为{}的FM第{}次迭代，当前训练集查准率为：{:.2%}'.format(alpha, t, precision_score_train))
            print('学习率为{}的FM第{}次迭代，当前训练集查全率为：{:.2%}'.format(alpha, t, recall_score_train))
            print('学习率为{}的FM第{}次迭代，当前训练集ndcg为：{:.4f}'.format(alpha, t, ndcg))
            # BUG FIX: this line said 测试集 (test set) while reporting the
            # TRAINING-set loss; corrected to 训练集.
            print('学习率为{}的FM第{}次迭代，当前训练集损失值为：{:.4f}'.format(alpha, t, total_loss_train / m))
        # BUG FIX: dead `if t != 0` guard removed (t starts at 1).
        p_train.append(precision_score_train)
        r_train.append(recall_score_train)
        ndcg_train.append(ndcg)
        mes_train.append(total_loss_train / m)
    return p_train, r_train, ndcg_train, mes_train

def Merge(dict1, dict2):
    """Merge dict1 into dict2 in place (dict2 is mutated); returns None."""
    dict2.update(dict1)

if __name__ == '__main__':
    star = datetime.now()  # wall-clock start for the runtime report at the end
    from pandas.core.frame import DataFrame

    new_data = DataFrame()  # NOTE(review): created but never used below
    # Learning rates to sweep.  NOTE(review): 0.5 appears twice and alpha=0
    # performs no learning at all -- confirm this list is intentional.
    i = [0.5,0.25,0,0.1,0.5,0.01,0.05,0.001,0.005]
    # Sweep-wide metric histories, keyed by learning rate: {alpha: [per-iter values]}
    p_test_2={}
    r_test_2 = {}
    ndcg_test_2 = {}
    mes_test_2 = {}
    p_train_2 = {}
    r_train_2 = {}
    ndcg_train_2 = {}
    mes_train_2 = {}
    for alpha in i:
        np.random.seed(123)  # identical RNG state per alpha so runs are comparable
        #df = pd.read_csv('c.csv')
        #df['Class'] = df['Class'].map({0: -1, 1: 1})  # remap label column from [0, 1] to [-1, 1]
        rnames = ['user_id', 'movie_id', 'rating', 'timestamp']  # column labels for the ratings file
        # NOTE(review): MovieLens u1.test is usually tab-separated; confirm '::' is the right separator here.
        df = pd.read_csv('u1.test', sep='::', header=None, names=rnames, engine='python')
        # NOTE(review): assigning .dtype on a DataFrame only attaches an attribute;
        # it does NOT convert column dtypes (df.astype(float) would).
        df.dtype = 'float'
        # Build a binary classification target from the rating column.
        df=df.drop('timestamp',axis=1)
        # NOTE(review): ratings >= 4 become -1.0 and the rest +1.0 (the inline
        # comment in the original claimed the opposite split), and the first
        # data row (index 0) is skipped by the iloc[1:] slices -- confirm both.
        df.iloc[1:, -1] = df.iloc[1:, -1].map(lambda x: -1.0 if x >= 4 else 1.0)  # 1,2是label=1  3，4,5是label=-1
        X_train, X_test, y_train, y_test = train_test_split(df.iloc[1:, :-1].values, df.iloc[1:, -1].values, test_size = 0.3, random_state = 123)
        # Min-max scaling to [0, 1].  NOTE(review): the test set is fitted with
        # its own scaler instead of reusing the training-set scaler -- this
        # leaks a different scale into evaluation; verify it is intended.
        X_train = MinMaxScaler().fit_transform(X_train)  # scale training set into [0, 1]
        X_test = MinMaxScaler().fit_transform(X_test)  # scale test set into [0, 1]
        '''*****************FM预测模型*****************'''
        all_FM_params = FM_SGD(X=X_train, y=y_train, k=5, alpha=alpha, iter=300)  # SGD-trained parameter snapshots [w0, W, V]
        w0, W, V = all_FM_params[-1]  # final FM parameters
        predicts_test = FM_predict(X=X_test, w0=w0, W=W, V=V)  # test-set class predictions
        predicts_train=FM_predict(X=X_train, w0=w0, W=W, V=V)  # training-set class predictions
        print('学习率为{}的FM在测试集的分类准确率为: {:.2%}'.format(alpha,accuracy_score(y_test,predicts_test)))
        print('学习率为{}的FM在训练集的分类准确率为: {:.2%}'.format(alpha,accuracy_score(y_train, predicts_train)))
        # Collect per-iteration test metrics for this alpha and fold them into
        # the sweep-wide dicts (Merge mutates its second argument).
        p_test ,r_test ,ndcg_test ,mes_test =ALL_test(all_FM_params,  y_test, X_test,alpha)
        p_test_1 = {alpha:p_test}
        r_test_1= {alpha:r_test}
        ndcg_test_1= {alpha:ndcg_test}
        mes_test_1= {alpha:mes_test}
        Merge(p_test_1, p_test_2)  # merge into the sweep-wide dict
        Merge(r_test_1, r_test_2)
        Merge(ndcg_test_1, ndcg_test_2)
        Merge(mes_test_1, mes_test_2)
        # Same collection for the training set.
        p_train ,r_train ,ndcg_train ,mes_train =ALL_train(all_FM_params, y_train,X_train,alpha)
        p_train_1 = {alpha: p_train}
        r_train_1 = {alpha: r_train}
        ndcg_train_1 = {alpha: ndcg_train}
        mes_train_1 = {alpha: mes_train}
        Merge(p_train_1, p_train_2)
        Merge(r_train_1, r_train_2)
        Merge(ndcg_train_1, ndcg_train_2)
        Merge(mes_train_1, mes_train_2)
    # Dump every metric table (rows = iteration index, columns = alpha) to CSV.
    p_test_3=DataFrame(p_test_2)
    p_test_3.to_csv("P_test表格.csv",index=True,sep=",")
    r_test_3 = DataFrame(r_test_2)
    r_test_3.to_csv("r_test表格.csv", index=True, sep=",")
    ndcg_test_3 = DataFrame(ndcg_test_2)
    ndcg_test_3.to_csv("ndcg_test表格.csv", index=True, sep=",")
    mes_test_3 = DataFrame(mes_test_2)
    mes_test_3.to_csv("mes_test表格.csv", index=True, sep=",")
    p_train_3 = DataFrame(p_train_2)
    p_train_3.to_csv("P_train表格.csv", index=True, sep=",")
    r_train_3 = DataFrame(r_train_2)
    r_train_3.to_csv("r_train表格.csv", index=True, sep=",")
    ndcg_train_3 = DataFrame(ndcg_train_2)
    ndcg_train_3.to_csv("ndcg_train表格.csv", index=True, sep=",")
    mes_train_3 = DataFrame(mes_train_2)
    mes_train_3.to_csv("mes_train表格.csv", index=True, sep=",")
    end = datetime.now()
    print("程序运行了:",end-star,"秒")